I was reading an article by Masahiro Uesaka, Shigeru Kuratani and Naoki Irie, and Fig. 1c (as well as some of the authors’ work on recapitulation) inspired me to check the level of correlation across developmental stages - not just to see which stages correspond to which.
This analysis is also useful when we consider that the ‘phylotypic phase’ should inherently be more conserved: we see a strong similarity between E4 in Fucus distichus and 5w in F. serratus, but also between E5 in F. distichus (which would be the most conserved stage in F. distichus) and 3w in F. serratus. From this, we can gather which correspondence is more accurate.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggfortify)
sample_info_fd <-
readr::read_csv("data/sample_info_fd.csv")
## Rows: 34 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_info_fs <-
readr::read_csv("data/sample_info_fs.csv")
## Rows: 48 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
sample_info_ec <-
readr::read_csv("data/sample_info_ec.csv")
## Rows: 57 Columns: 10
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): SampleName, SeqProt, Species, stage_tissue, Stage, Tissue, Experime...
## dbl (1): Replicate
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# F. distichus: keep the embryo stages E1-E6 and build a per-library label of
# the form <Species>_<Stage>_<Replicate>.
sample_info_fd_embryo <- sample_info_fd %>%
  dplyr::filter(is.element(Stage, c("E1", "E2", "E3", "E4", "E5", "E6"))) %>%
  dplyr::mutate(LibLab = str_c(Species, Stage, Replicate, sep = "_"))

# F. serratus: keep the embryo stages 24H-4w. The three Fs_X_zygotes_48h
# libraries get replicate numbers 4-6 before the label is built
# (presumably so their labels do not collide with other 48H replicates -
# TODO confirm).
sample_info_fs_embryo <- sample_info_fs %>%
  dplyr::filter(is.element(Stage, c("24H", "48H", "1w", "3w", "4w"))) %>%
  dplyr::mutate(
    Replicate = case_when(
      LibName == "Fs_X_zygotes_48h_1" ~ 4,
      LibName == "Fs_X_zygotes_48h_2" ~ 5,
      LibName == "Fs_X_zygotes_48h_3" ~ 6,
      .default = Replicate
    ),
    # Uses the renumbered Replicate computed just above.
    LibLab = str_c(Species, Stage, Replicate, sep = "_")
  )
Fs_abundance <-
readr::read_csv(file = "data/Fs_OG_abundance.csv")
## Rows: 5596 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (48): Fs_F_gametes_1, Fs_F_gametes_2, Fs_F_gametes_3, Fs_M_gametes_1, Fs...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fs_counts <-
readr::read_csv(file = "data/Fs_OG_counts.csv")
## Rows: 5596 Columns: 49
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (48): Fs_F_gametes_1, Fs_F_gametes_2, Fs_F_gametes_3, Fs_M_gametes_1, Fs...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fd_abundance <-
readr::read_csv(file = "data/Fd_OG_abundance.csv")
## Rows: 5410 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (34): Fd_f_gametes_1, Fd_f_gametes_2, Fd_f_gametes_3, Fd_m_gametes_1, Fd...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Fd_counts <-
readr::read_csv(file = "data/Fd_OG_counts.csv")
## Rows: 5410 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (34): Fd_f_gametes_1, Fd_f_gametes_2, Fd_f_gametes_3, Fd_m_gametes_1, Fd...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_abundance <-
readr::read_csv(file = "data/Ec_OG_abundance.csv")
## Rows: 7059 Columns: 58
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (57): M_meiospore_low_1, M_meiospore_low_2, M_meiospore_low_3, F_meiospo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Ec_counts <-
readr::read_csv(file = "data/Ec_OG_counts.csv")
## Rows: 7059 Columns: 58
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): OG
## dbl (57): M_meiospore_low_1, M_meiospore_low_2, M_meiospore_low_3, F_meiospo...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Keep the rows (genes / OGs) whose mean across all columns exceeds a threshold.
#
# counts:    numeric matrix (or data frame) with one row per gene/OG and one
#            column per sample.
# min.count: a row is kept only if its row mean is strictly greater than this
#            value (default 2).
#
# Returns `counts` restricted to the passing rows. The result keeps its
# two-dimensional shape even when exactly one row (or no row) passes.
selectGenes <- function(counts, min.count = 2) {
  above_threshold <- rowMeans(counts) > min.count
  # drop = FALSE: without it, a single surviving row of a matrix collapses to a
  # plain vector and loses its row name, which breaks downstream matrix code.
  counts[above_threshold, , drop = FALSE]
}
To minimise differences between species, I remove OGs with low counts.
# Sample metadata of both Fucus species (Fs rows first, then Fd), with the
# LibName column moved to the front.
combined_metatable <-
  relocate(rbind(sample_info_fs_embryo, sample_info_fd_embryo), LibName)

# Join the two TPM tables on the orthogroup ID and keep only the embryo
# libraries, in the same order as the rows of combined_metatable.
merged_TPM_pre <- dplyr::select(
  merge(Fs_abundance, Fd_abundance, by = "OG"),
  1, combined_metatable$LibName
)

# Move the OG identifiers (first column) into the row names.
merged_TPM <- data.frame(
  merged_TPM_pre[, -1],
  row.names = merged_TPM_pre[, 1],
  check.names = FALSE
)

# Numeric matrix restricted to OGs whose mean across libraries exceeds 10.
merged_TPM.filt <- selectGenes(as.matrix(merged_TPM), min.count = 10)
Through the filtering function, I retain 2889 / 3724 OGs
# Regularised-log transform of the filtered TPM matrix. Values are rounded to
# whole numbers first because rlog() works on integer counts (see the
# "converting counts to integer mode" message below).
# NOTE(review): rlog() is designed for raw read counts; Fs_counts / Fd_counts
# are loaded above, but it is the TPM table that gets transformed here -
# confirm this is intended.
rlog_TPM <- DESeq2::rlog(merged_TPM.filt %>% round(0))
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## converting counts to integer mode
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
## function: y = a/x + b, and a local regression fit was automatically substituted.
## specify fitType='local' or 'mean' to avoid this message next time.
log2_TPM <- log2(merged_TPM.filt+1)
sqrt_TPM <- sqrt(merged_TPM.filt)
# tpm10_TPM <- (merged_TPM.filt > 10) * 1
Here, we use distance metrics to quantify how distant the stages are from each other across Fucus species and maybe even in Ectocarpus.
library(philentropy)
# Column positions (not counts, despite the "ncol_" prefix) of the libraries
# whose name contains "Fs" and "Fd" respectively.
ncol_Fs <- which(grepl("Fs", colnames(sqrt_TPM)))
ncol_Fd <- which(grepl("Fd", colnames(sqrt_TPM)))

# Copy of the rlog matrix whose columns carry the LibLab labels from the
# metadata table; this relies on the matrix columns being in metadata row order.
rlog_TPM_renamed <- rlog_TPM
colnames(rlog_TPM_renamed) <- combined_metatable[["LibLab"]]
Linear relationship
# Pearson correlation between every Fs library (rows of M) and every Fd
# library (columns of M), computed across OGs on the rlog values.
M <- cor(
  x = rlog_TPM_renamed[, ncol_Fs],
  y = rlog_TPM_renamed[, ncol_Fd],
  method = "pearson"
)

# Long format: one row per (Fs library, Fd library) pair, with the columns
# Var1 (Fs label), Var2 (Fd label) and value (the correlation).
M_melt <- reshape2::melt(M)

# Attach the Fd stage to every pair by matching on the Fd library label (Var2).
tmp1 <- full_join(
  select(sample_info_fd_embryo, Var2 = LibLab, Fd = Stage),
  M_melt,
  multiple = "all"
)
## Joining with `by = join_by(Var2)`
# Attach the Fs stage to every (Fs library, Fd library) pair by matching on the
# Fs library label (Var1); the stage column is named Fs in the result.
tmp2 <- full_join(
  select(sample_info_fs_embryo, Var1 = LibLab, Fs = Stage),
  tmp1,
  multiple = "all"
)
## Joining with `by = join_by(Var1)`
# Fix the stage order to the order of first appearance, so that axes and
# legends follow the metadata order instead of alphabetical order.
tmp2[["Fs"]] <- factor(tmp2[["Fs"]], levels = unique(tmp2[["Fs"]]))
tmp2[["Fd"]] <- factor(tmp2[["Fd"]], levels = unique(tmp2[["Fd"]]))

# Median correlation over all library pairs of each (Fs stage, Fd stage)
# combination, shown as a dissimilarity (1 - r) along the Fs stages with one
# line per Fd stage.
tmp2 %>%
  group_by(Fs, Fd) %>%
  summarise(median_corr = median(value)) %>%
  ggplot(mapping = aes(x = Fs, y = 1 - median_corr, colour = Fd, group = Fd)) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  xlab("Fucus serratus stages") +
  ylab("1 - median(correlation with Fd stages)") +
  ggtitle("Pearson correlation (rlog)") +
  labs(colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
# Same stage-pair medians as a dissimilarity (1 - r), this time along the
# Fd stages with one line per Fs stage.
ggplot(
  data = summarise(group_by(tmp2, Fs, Fd), median_corr = median(value)),
  mapping = aes(x = Fd, y = 1 - median_corr, colour = Fs, group = Fs)
) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  xlab("Fucus distichus stages") +
  ylab("1 - median(correlation with Fs stages)") +
  ggtitle("Pearson correlation (rlog)") +
  labs(colour = "Fs stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Monotonic relationship
rlog_TPM_renamed <- rlog_TPM
base::colnames(rlog_TPM_renamed) <- combined_metatable$LibLab
M = cor(x = rlog_TPM_renamed[,ncol_Fs],
y = rlog_TPM_renamed[,ncol_Fd],
method = "spearman")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = 1 - median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "1 - median(correlation with Fd stages)",
title = "Spearman correlation (rlog)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fd, y = 1 - median_corr, colour = Fs, group = Fs)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus distichus stages",
y = "1 - median(correlation with Fs stages)",
title = "Spearman correlation (rlog)",
colour = "Fs stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Manhattan distance is a distance metric used to measure the distance between two points in a grid-like system, such as a city block or a chessboard. It is calculated as the sum of the absolute differences between the coordinates of the two points. The term “Manhattan” is used because the grid-like structure and the right-angled turns of the streets in Manhattan, New York, resemble a chessboard. The Manhattan distance is also known as the L1 norm or taxicab distance. It is commonly used in machine learning and data science to measure similarity between data points or to cluster data.
It is the $L_1$ norm, unlike the Euclidean distance, which is the $L_2$ norm.
Because it uses absolute values instead of squaring and taking a square root, it is said to be more robust to outliers than the Euclidean distance. Additionally, the absolute value is much faster to compute than exponentiation. It is a special case of the Minkowski distance (with $p = 1$).
DM <- philentropy::distance(t(rlog_TPM_renamed), use.row.names = TRUE, method = "manhattan")
## Metric: 'manhattan'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "median(distance with Fd stages)",
title = "Manhattan distance (rlog)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fd, y = median_corr, colour = Fs, group = Fs)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus distichus stages",
y = "median(distance with Fs stages)",
title = "Manhattan distance (rlog)",
colour = "Fs stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Euclidean distance is a measure of the straight-line distance between two points in a multi-dimensional space. It is calculated as the square root of the sum of the squared differences between the corresponding coordinates of the two points. The Euclidean distance is commonly used in clustering and classification algorithms, as well as in distance-based data analysis and visualization techniques.
Euclidean is not ideal for high-dimension data. But it is commonly used.
DM <- philentropy::distance(t(rlog_TPM_renamed), use.row.names = TRUE, method = "euclidean")
## Metric: 'euclidean'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "median(distance with Fd stages)",
title = "Euclidean distance (rlog)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fd, y = median_corr, colour = Fs, group = Fs)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus distichus stages",
y = "median(distance with Fs stages)",
title = "Euclidean distance (rlog)",
colour = "Fs stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Jensen-Shannon distance is a statistical distance metric that measures the similarity between two probability distributions. It is often used in data analysis and machine learning for clustering, classification, and anomaly detection tasks.
# Scale every library (column) so that its values sum to 1, i.e. turn each
# expression profile into a probability-like vector for the JSD computation.
# NOTE(review): this only yields valid probabilities if all rlog values are
# non-negative - confirm (e.g. with min(rlog_TPM_renamed)).
mat_normalized <- apply(rlog_TPM_renamed, 2, function(x) x/sum(x)) %>% as.data.frame()
# colSums(mat_normalized)
# Pairwise Jensen-Shannon values between all libraries (rows of the transposed
# table); use.row.names = TRUE labels the result with the library names.
DM <- philentropy::distance(t(mat_normalized), use.row.names = TRUE, method = "jensen-shannon")
## Metric: 'jensen-shannon' using unit: 'log'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
DM3 <- sqrt(DM2)
M_melt <- reshape2::melt(DM3)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
# Median of `value` over all library pairs of each (Fs stage, Fd stage)
# combination, plotted along the Fs stages with one line per Fd stage.
tmp2 %>%
  group_by(Fs, Fd) %>%
  summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  # Fs is the stage factor, so the x axis shows stages (not individual
  # samples); the label matches the other Fs-axis plots.
  labs(x = "Fucus serratus stages",
       y = "median(distance with Fd stages)",
       title = "JSD metric (norm. rlog)",
       colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
# Median of `value` over all library pairs of each (Fs stage, Fd stage)
# combination, plotted along the Fd stages with one line per Fs stage.
tmp2 %>%
  group_by(Fs, Fd) %>%
  summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fd, y = median_corr, colour = Fs, group = Fs)) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  # Fd is the stage factor, so the x axis shows stages (not individual
  # samples); the label matches the other Fd-axis plots.
  labs(x = "Fucus distichus stages",
       y = "median(distance with Fs stages)",
       title = "JSD metric (norm. rlog)",
       colour = "Fs stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable$LibLab
Linear relationship
M = cor(x = log2_TPM_renamed[,ncol_Fs],
y = log2_TPM_renamed[,ncol_Fd],
method = "pearson")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = 1 - median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "1 - median(correlation with Fd stages)",
title = "Pearson correlation (log2)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Monotonic relationship
log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable$LibLab
M = cor(x = log2_TPM_renamed[,ncol_Fs],
y = log2_TPM_renamed[,ncol_Fd],
method = "spearman")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = 1 - median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "1 - median(correlation with Fd stages)",
title = "Spearman correlation (log2)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Manhattan distance is a distance metric used to measure the distance between two points in a grid-like system, such as a city block or a chessboard. It is calculated as the sum of the absolute differences between the coordinates of the two points. The term “Manhattan” is used because the grid-like structure and the right-angled turns of the streets in Manhattan, New York, resemble a chessboard. The Manhattan distance is also known as the L1 norm or taxicab distance. It is commonly used in machine learning and data science to measure similarity between data points or to cluster data.
It is the $L_1$ norm, unlike the Euclidean distance, which is the $L_2$ norm.
Because it uses absolute values instead of squaring and taking a square root, it is said to be more robust to outliers than the Euclidean distance. Additionally, the absolute value is much faster to compute than exponentiation. It is a special case of the Minkowski distance (with $p = 1$).
DM <- philentropy::distance(t(log2_TPM_renamed), use.row.names = TRUE, method = "manhattan")
## Metric: 'manhattan'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "median(distance with Fd stages)",
title = "Manhattan distance (log2)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Euclidean distance is a measure of the straight-line distance between two points in a multi-dimensional space. It is calculated as the square root of the sum of the squared differences between the corresponding coordinates of the two points. The Euclidean distance is commonly used in clustering and classification algorithms, as well as in distance-based data analysis and visualization techniques.
Euclidean is not ideal for high-dimension data. But it is commonly used.
DM <- philentropy::distance(t(log2_TPM_renamed), use.row.names = TRUE, method = "euclidean")
## Metric: 'euclidean'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "median(distance with Fd stages)",
title = "Euclidean distance (log2)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Jensen-Shannon distance is a statistical distance metric that measures the similarity between two probability distributions. It is often used in data analysis and machine learning for clustering, classification, and anomaly detection tasks.
mat_normalized <- apply(log2_TPM_renamed, 2, function(x) x/sum(x)) %>% as.data.frame()
# colSums(mat_normalized)
DM <- philentropy::distance(t(mat_normalized), use.row.names = TRUE, method = "jensen-shannon")
## Metric: 'jensen-shannon' using unit: 'log'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
DM3 <- sqrt(DM2)
M_melt <- reshape2::melt(DM3)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
# Median of `value` over all library pairs of each (Fs stage, Fd stage)
# combination, plotted along the Fs stages with one line per Fd stage.
tmp2 %>%
  group_by(Fs, Fd) %>%
  summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  # Fs is the stage factor, so the x axis shows stages (not individual
  # samples); the label matches the other Fs-axis plots.
  labs(x = "Fucus serratus stages",
       y = "median(distance with Fd stages)",
       title = "JSD metric (norm. log2)",
       colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
sqrt_TPM_renamed <- sqrt_TPM
base::colnames(sqrt_TPM_renamed) <- combined_metatable$LibLab
Linear relationship
M = cor(x = sqrt_TPM_renamed[,ncol_Fs],
y = sqrt_TPM_renamed[,ncol_Fd],
method = "pearson")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = 1 - median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "1 - median(correlation with Fd stages)",
title = "Pearson correlation (sqrt)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Monotonic relationship
sqrt_TPM_renamed <- sqrt_TPM
base::colnames(sqrt_TPM_renamed) <- combined_metatable$LibLab
M = cor(x = sqrt_TPM_renamed[,ncol_Fs],
y = sqrt_TPM_renamed[,ncol_Fd],
method = "spearman")
M_melt <- reshape2::melt(M)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = 1 - median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "1 - median(correlation with Fd stages)",
title = "Spearman correlation (sqrt)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Manhattan distance is a distance metric used to measure the distance between two points in a grid-like system, such as a city block or a chessboard. It is calculated as the sum of the absolute differences between the coordinates of the two points. The term “Manhattan” is used because the grid-like structure and the right-angled turns of the streets in Manhattan, New York, resemble a chessboard. The Manhattan distance is also known as the L1 norm or taxicab distance. It is commonly used in machine learning and data science to measure similarity between data points or to cluster data.
It is the $L_1$ norm, unlike the Euclidean distance, which is the $L_2$ norm.
Because it uses absolute values instead of squaring and taking a square root, it is said to be more robust to outliers than the Euclidean distance. Additionally, the absolute value is much faster to compute than exponentiation. It is a special case of the Minkowski distance (with $p = 1$).
DM <- philentropy::distance(t(sqrt_TPM_renamed), use.row.names = TRUE, method = "manhattan")
## Metric: 'manhattan'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
# Median Manhattan distance over all library pairs of each (Fs stage, Fd stage)
# combination, plotted along the Fs stages with one line per Fd stage.
tmp2 %>%
  group_by(Fs, Fd) %>%
  summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  # The plotted values come from the distance matrix computed with
  # method = "manhattan" on the sqrt-transformed TPM, so the title must say
  # Manhattan (it previously read "Euclidean", a copy-paste slip).
  labs(x = "Fucus serratus stages",
       y = "median(distance with Fd stages)",
       title = "Manhattan distance (sqrt)",
       colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Euclidean distance is a measure of the straight-line distance between two points in a multi-dimensional space. It is calculated as the square root of the sum of the squared differences between the corresponding coordinates of the two points. The Euclidean distance is commonly used in clustering and classification algorithms, as well as in distance-based data analysis and visualization techniques.
Euclidean is not ideal for high-dimension data. But it is commonly used.
DM <- philentropy::distance(t(sqrt_TPM_renamed), use.row.names = TRUE, method = "euclidean")
## Metric: 'euclidean'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
M_melt <- reshape2::melt(DM2)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2 %>% group_by(Fs, Fd) %>% summarise(median_corr = median(value)) %>%
ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
geom_point(size = 3) +
geom_line(linewidth = 2, alpha = 0.5) +
theme_classic() +
labs(x = "Fucus serratus stages",
y = "median(distance with Fd stages)",
title = "Euclidean distance (sqrt)",
colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Jensen-Shannon distance is a statistical distance metric that measures the similarity between two probability distributions. It is often used in data analysis and machine learning for clustering, classification, and anomaly detection tasks.
mat_normalized <- apply(sqrt_TPM_renamed, 2, function(x) x/sum(x)) %>% as.data.frame()
# colSums(mat_normalized)
DM <- philentropy::distance(t(mat_normalized), use.row.names = TRUE, method = "jensen-shannon")
## Metric: 'jensen-shannon' using unit: 'log'; comparing: 34 vectors.
DM2 <- DM[ncol_Fs, ncol_Fd]
DM3 <- sqrt(DM2)
M_melt <- reshape2::melt(DM3)
tmp1 <- sample_info_fd_embryo %>%
select(Var2 = LibLab, Stage) %>%
full_join(M_melt, multiple = "all") %>%
dplyr::rename(Fd = Stage)
## Joining with `by = join_by(Var2)`
tmp2 <- sample_info_fs_embryo %>%
select(Var1 = LibLab, Stage) %>%
full_join(tmp1, multiple = "all") %>%
dplyr::rename(Fs = Stage)
## Joining with `by = join_by(Var1)`
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
# Median of `value` over all library pairs of each (Fs stage, Fd stage)
# combination, plotted along the Fs stages with one line per Fd stage.
tmp2 %>%
  group_by(Fs, Fd) %>%
  summarise(median_corr = median(value)) %>%
  ggplot(aes(x = Fs, y = median_corr, colour = Fd, group = Fd)) +
  geom_point(size = 3) +
  geom_line(linewidth = 2, alpha = 0.5) +
  theme_classic() +
  # Fs is the stage factor, so the x axis shows stages (not individual
  # samples); the label matches the other Fs-axis plots.
  labs(x = "Fucus serratus stages",
       y = "median(distance with Fd stages)",
       title = "JSD metric (norm. sqrt)",
       colour = "Fd stages")
## `summarise()` has grouped output by 'Fs'. You can override using the `.groups`
## argument.
Here we use JSD metric.
# Ectocarpus metadata: LibName moved to the front, plus a per-library label of
# the form <Species>_<Sex>_<Stage>_<Replicate>.
# NOTE(review): the join further down warns about a many-to-many match on this
# label, so LibLab is apparently not unique among the Ec libraries; the
# commented-out check below can confirm which labels repeat.
sample_info_ec_proc <-
sample_info_ec %>%
dplyr::relocate(LibName) %>%
dplyr::mutate(LibLab = str_c(Species, "_", Sex, "_", Stage, "_", Replicate))
# duplicated(sample_info_ec_proc$LibLab)
# Metadata of all three species stacked: Fucus rows first, then Ectocarpus.
combined_metatable_inclEc <-
rbind(combined_metatable, sample_info_ec_proc)
# Ectocarpus TPM table: first column plus one column per Ec library.
# NOTE(review): columns are picked by SampleName here, whereas the Fucus tables
# are picked by LibName - confirm SampleName matches the Ec_abundance columns.
merged_TPM_inclEc_pre <-
Ec_abundance %>%
dplyr::select(1, sample_info_ec_proc$SampleName)
# Join with the Fucus table. merge() is called without `by`, so it joins on all
# shared column names (presumably only OG - TODO confirm). The name
# merged_TPM_inclEc_pre is reused for the joined result.
merged_TPM_inclEc_pre <-
merged_TPM_pre %>%
merge(merged_TPM_inclEc_pre)
# Move the first column (the OG identifiers) into the row names.
merged_TPM_inclEc <-
data.frame(row.names = merged_TPM_inclEc_pre[,1], merged_TPM_inclEc_pre[,-1], check.names = FALSE)
# Overwrites the Fucus-only merged_TPM.filt with the three-species matrix,
# keeping OGs whose mean across all libraries exceeds 10.
merged_TPM.filt <-
as.matrix(merged_TPM_inclEc) %>%
selectGenes(min.count=10)
We keep 1620 OGs out of 1979 OGs.
# log2(x + 1) transform of the filtered matrix.
log2_TPM <- log2(merged_TPM.filt + 1)
# Copy of the matrix whose columns are relabelled, by position, with the LibLab
# values of the combined metadata table.
log2_TPM_renamed <- log2_TPM
base::colnames(log2_TPM_renamed) <- combined_metatable_inclEc$LibLab
# Column indices (not counts, despite the names) of the libraries of each
# species, found by pattern match on the column labels.
lib_labels <- colnames(log2_TPM_renamed)
ncol_Fs <- grep(pattern = "Fs", x = lib_labels)
ncol_Fd <- grep(pattern = "Fd", x = lib_labels)
ncol_Ec <- grep(pattern = "Ec_", x = lib_labels)
# Pearson correlation between every F. serratus library (rows of M) and every
# Ectocarpus library (columns of M).
M <- cor(
  x = log2_TPM_renamed[, ncol_Fs],
  y = log2_TPM_renamed[, ncol_Ec],
  method = "pearson"
)
M_melt <- reshape2::melt(M)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. serratus stage of each library.
tmp2 <- sample_info_fs_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fs = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median correlation per (Fs stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fs, Ec) %>%
  summarise(median_corr = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus serratus stages",
       y = "1 - median(correlation with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Pearson correlation (log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Pearson correlation (log2)")
# Pearson correlation between every F. distichus library (rows of M) and every
# Ectocarpus library (columns of M).
M <- cor(
  x = log2_TPM_renamed[, ncol_Fd],
  y = log2_TPM_renamed[, ncol_Ec],
  method = "pearson"
)
M_melt <- reshape2::melt(M)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. distichus stage of each library.
tmp2 <- sample_info_fd_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fd = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median correlation per (Fd stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fd, Ec) %>%
  summarise(median_corr = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus distichus stages",
       y = "1 - median(correlation with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Pearson correlation (log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Pearson correlation (log2)")
# Spearman correlation between every F. serratus library (rows of M) and every
# Ectocarpus library (columns of M).
M <- cor(
  x = log2_TPM_renamed[, ncol_Fs],
  y = log2_TPM_renamed[, ncol_Ec],
  method = "spearman"
)
M_melt <- reshape2::melt(M)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. serratus stage of each library.
tmp2 <- sample_info_fs_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fs = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median correlation per (Fs stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fs, Ec) %>%
  summarise(median_corr = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus serratus stages",
       y = "1 - median(correlation with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Spearman correlation (log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Spearman correlation (log2)")
# Spearman correlation between every F. distichus library (rows of M) and
# every Ectocarpus library (columns of M).
M <- cor(
  x = log2_TPM_renamed[, ncol_Fd],
  y = log2_TPM_renamed[, ncol_Ec],
  method = "spearman"
)
M_melt <- reshape2::melt(M)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. distichus stage of each library.
tmp2 <- sample_info_fd_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fd = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median correlation per (Fd stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fd, Ec) %>%
  summarise(median_corr = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus distichus stages",
       y = "1 - median(correlation with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Spearman correlation (log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = 1 - median_corr, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Spearman correlation (log2)")
# Scale every library (column) so that its values sum to 1; the
# Jensen-Shannon computation below treats each library as a probability vector.
mat_normalized <- apply(log2_TPM_renamed, 2, function(x) x / sum(x)) %>%
  as.data.frame()
# colSums(mat_normalized)
# Libraries are put in rows (transpose) so that they are the vectors compared.
DM <- philentropy::distance(t(mat_normalized), use.row.names = TRUE, method = "jensen-shannon")
## Metric: 'jensen-shannon' using unit: 'log'; comparing: 91 vectors.
# Keep only the Fs (rows) x Ec (columns) comparisons.
DM2 <- DM[ncol_Fs, ncol_Ec]
# The square root of the Jensen-Shannon divergence is the Jensen-Shannon distance.
DM3 <- sqrt(DM2)
M_melt <- reshape2::melt(DM3)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. serratus stage of each library.
tmp2 <- sample_info_fs_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fs = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median distance per (Fs stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fs, Ec) %>%
  summarise(median_dist = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus serratus stages",
       y = "median(distance with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "JSD metric (norm. log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "JSD metric (norm. log2)")
# Reuse the Jensen-Shannon matrix DM computed above and keep only the
# Fd (rows) x Ec (columns) comparisons.
DM2 <- DM[ncol_Fd, ncol_Ec]
# The square root of the Jensen-Shannon divergence is the Jensen-Shannon distance.
DM3 <- sqrt(DM2)
M_melt <- reshape2::melt(DM3)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. distichus stage of each library.
tmp2 <- sample_info_fd_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fd = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median distance per (Fd stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fd, Ec) %>%
  summarise(median_dist = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus distichus stages",
       y = "median(distance with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "JSD metric (norm. log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "JSD metric (norm. log2)")
# Manhattan distance between all libraries; they are put in rows (transpose)
# so that they are the vectors compared.
DM <- philentropy::distance(t(log2_TPM_renamed), use.row.names = TRUE, method = "manhattan")
## Metric: 'manhattan'; comparing: 91 vectors.
# Keep only the Fs (rows) x Ec (columns) comparisons.
DM2 <- DM[ncol_Fs, ncol_Ec]
M_melt <- reshape2::melt(DM2)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. serratus stage of each library.
tmp2 <- sample_info_fs_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fs = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fs <- factor(tmp2$Fs, levels = unique(tmp2$Fs))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median distance per (Fs stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fs, Ec) %>%
  summarise(median_dist = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus serratus stages",
       y = "median(distance with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Manhattan distance (log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fs, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Manhattan distance (log2)")
# Reuse the Manhattan distance matrix DM computed above and keep only the
# Fd (rows) x Ec (columns) comparisons.
DM2 <- DM[ncol_Fd, ncol_Ec]
M_melt <- reshape2::melt(DM2)
# Attach the Ectocarpus stage and sex to each value. LibLab is repeated in the
# Ectocarpus metadata (joining without de-duplication raised a many-to-many
# warning), so the lookup is reduced to one row per label first; otherwise the
# values of repeated labels are duplicated and over-weighted in the medians.
tmp1 <- sample_info_ec_proc %>%
  dplyr::select(Var2 = LibLab, Stage, Sex) %>%
  dplyr::distinct() %>%
  dplyr::mutate(StageSex = str_c(Stage, "_", Sex)) %>%
  dplyr::full_join(M_melt, by = "Var2", multiple = "all") %>%
  dplyr::rename(Ec = StageSex) %>%
  dplyr::select(-Stage, -Sex)
# Attach the F. distichus stage of each library.
tmp2 <- sample_info_fd_embryo %>%
  select(Var1 = LibLab, Stage) %>%
  full_join(tmp1, by = "Var1", multiple = "all") %>%
  dplyr::rename(Fd = Stage)
# Keep stages in order of first appearance instead of alphabetical order.
tmp2$Fd <- factor(tmp2$Fd, levels = unique(tmp2$Fd))
tmp2$Ec <- factor(tmp2$Ec, levels = unique(tmp2$Ec))
# Median distance per (Fd stage, Ec stage_sex) pair, shared by both plots.
stage_medians <- tmp2 %>%
  group_by(Fd, Ec) %>%
  summarise(median_dist = median(value), .groups = "drop")
# Layers common to both plots.
shared_layers <- list(
  geom_point(size = 3),
  geom_line(linewidth = 2, alpha = 0.5),
  ggplot2::facet_grid(~ Sex),
  labs(x = "Fucus distichus stages",
       y = "median(distance with Ec stages)",
       colour = "Ectocarpus stages")
)
# Plot 1: Ectocarpus stages whose label contains "spore" or "gamete" removed.
# The regex splits "stage_sex" at its last underscore.
stage_medians %>%
  dplyr::filter(!stringr::str_detect(Ec, "spore|gamete")) %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Manhattan distance (log2) [only multicellular]")
# Plot 2: all Ectocarpus stages.
stage_medians %>%
  tidyr::separate(Ec, into = c("Ec", "Sex"), sep = "_(?=[^_]+$)") %>%
  ggplot(aes(x = Fd, y = median_dist, colour = Ec, group = Ec)) +
  shared_layers +
  labs(title = "Manhattan distance (log2)")
# Record the R version, platform settings and package versions used to render
# this analysis.
devtools::session_info()
## ─ Session info ───────────────────────────────────────────────────────────────
## setting value
## version R version 4.2.2 (2022-10-31)
## os macOS Big Sur ... 10.16
## system x86_64, darwin17.0
## ui X11
## language (EN)
## collate en_US.UTF-8
## ctype en_US.UTF-8
## tz Europe/Berlin
## date 2023-11-22
## pandoc 3.1.6.2 @ /usr/local/bin/ (via rmarkdown)
##
## ─ Packages ───────────────────────────────────────────────────────────────────
## package * version date (UTC) lib source
## annotate 1.76.0 2022-11-01 [1] Bioconductor
## AnnotationDbi 1.60.2 2023-03-10 [1] Bioconductor
## Biobase 2.58.0 2022-11-01 [1] Bioconductor
## BiocGenerics 0.44.0 2022-11-01 [1] Bioconductor
## BiocParallel 1.32.6 2023-03-17 [1] Bioconductor
## Biostrings 2.66.0 2022-11-01 [1] Bioconductor
## bit 4.0.5 2022-11-15 [1] CRAN (R 4.2.0)
## bit64 4.0.5 2020-08-30 [1] CRAN (R 4.2.0)
## bitops 1.0-7 2021-04-24 [1] CRAN (R 4.2.0)
## blob 1.2.4 2023-03-17 [1] CRAN (R 4.2.0)
## bslib 0.5.1 2023-08-11 [1] CRAN (R 4.2.0)
## cachem 1.0.8 2023-05-01 [1] CRAN (R 4.2.0)
## callr 3.7.3 2022-11-02 [1] CRAN (R 4.2.0)
## cli 3.6.1 2023-03-23 [1] CRAN (R 4.2.0)
## codetools 0.2-19 2023-02-01 [1] CRAN (R 4.2.0)
## colorspace 2.1-0 2023-01-23 [1] CRAN (R 4.2.0)
## crayon 1.5.2 2022-09-29 [1] CRAN (R 4.2.0)
## DBI 1.1.3 2022-06-18 [1] CRAN (R 4.2.0)
## DelayedArray 0.24.0 2022-11-01 [1] Bioconductor
## DESeq2 1.38.3 2023-01-19 [1] Bioconductor
## devtools 2.4.5 2022-10-11 [1] CRAN (R 4.2.0)
## digest 0.6.33 2023-07-07 [1] CRAN (R 4.2.0)
## dplyr * 1.1.3 2023-09-03 [1] CRAN (R 4.2.0)
## ellipsis 0.3.2 2021-04-29 [1] CRAN (R 4.2.0)
## evaluate 0.22 2023-09-29 [1] CRAN (R 4.2.2)
## fansi 1.0.5 2023-10-08 [1] CRAN (R 4.2.2)
## farver 2.1.1 2022-07-06 [1] CRAN (R 4.2.0)
## fastmap 1.1.1 2023-02-24 [1] CRAN (R 4.2.0)
## forcats * 1.0.0 2023-01-29 [1] CRAN (R 4.2.0)
## fs 1.6.3 2023-07-20 [1] CRAN (R 4.2.0)
## geneplotter 1.76.0 2022-11-01 [1] Bioconductor
## generics 0.1.3 2022-07-05 [1] CRAN (R 4.2.0)
## GenomeInfoDb 1.34.9 2023-02-02 [1] Bioconductor
## GenomeInfoDbData 1.2.9 2023-05-04 [1] Bioconductor
## GenomicRanges 1.50.2 2022-12-16 [1] Bioconductor
## ggfortify * 0.4.16 2023-03-20 [1] CRAN (R 4.2.0)
## ggplot2 * 3.4.4 2023-10-12 [1] CRAN (R 4.2.2)
## glue 1.6.2 2022-02-24 [1] CRAN (R 4.2.0)
## gridExtra 2.3 2017-09-09 [1] CRAN (R 4.2.0)
## gtable 0.3.4 2023-08-21 [1] CRAN (R 4.2.0)
## hms 1.1.3 2023-03-21 [1] CRAN (R 4.2.0)
## htmltools 0.5.6.1 2023-10-06 [1] CRAN (R 4.2.2)
## htmlwidgets 1.6.2 2023-03-17 [1] CRAN (R 4.2.0)
## httpuv 1.6.11 2023-05-11 [1] CRAN (R 4.2.2)
## httr 1.4.7 2023-08-15 [1] CRAN (R 4.2.0)
## IRanges 2.32.0 2022-11-01 [1] Bioconductor
## jquerylib 0.1.4 2021-04-26 [1] CRAN (R 4.2.0)
## jsonlite 1.8.7 2023-06-29 [1] CRAN (R 4.2.0)
## KEGGREST 1.38.0 2022-11-01 [1] Bioconductor
## knitr 1.44 2023-09-11 [1] CRAN (R 4.2.0)
## labeling 0.4.3 2023-08-29 [1] CRAN (R 4.2.0)
## later 1.3.1 2023-05-02 [1] CRAN (R 4.2.2)
## lattice 0.21-9 2023-10-01 [1] CRAN (R 4.2.2)
## lifecycle 1.0.3 2022-10-07 [1] CRAN (R 4.2.0)
## locfit 1.5-9.8 2023-06-11 [1] CRAN (R 4.2.0)
## lubridate * 1.9.3 2023-09-27 [1] CRAN (R 4.2.0)
## magrittr 2.0.3 2022-03-30 [1] CRAN (R 4.2.0)
## Matrix 1.5-4.1 2023-05-18 [1] CRAN (R 4.2.0)
## MatrixGenerics 1.10.0 2022-11-01 [1] Bioconductor
## matrixStats 1.0.0 2023-06-02 [1] CRAN (R 4.2.0)
## memoise 2.0.1 2021-11-26 [1] CRAN (R 4.2.0)
## mime 0.12 2021-09-28 [1] CRAN (R 4.2.0)
## miniUI 0.1.1.1 2018-05-18 [1] CRAN (R 4.2.0)
## munsell 0.5.0 2018-06-12 [1] CRAN (R 4.2.0)
## philentropy * 0.7.0 2022-11-05 [1] CRAN (R 4.2.0)
## pillar 1.9.0 2023-03-22 [1] CRAN (R 4.2.0)
## pkgbuild 1.4.2 2023-06-26 [1] CRAN (R 4.2.0)
## pkgconfig 2.0.3 2019-09-22 [1] CRAN (R 4.2.0)
## pkgload 1.3.3 2023-09-22 [1] CRAN (R 4.2.0)
## plyr 1.8.9 2023-10-02 [1] CRAN (R 4.2.2)
## png 0.1-8 2022-11-29 [1] CRAN (R 4.2.0)
## prettyunits 1.2.0 2023-09-24 [1] CRAN (R 4.2.0)
## processx 3.8.2 2023-06-30 [1] CRAN (R 4.2.0)
## profvis 0.3.8 2023-05-02 [1] CRAN (R 4.2.0)
## promises 1.2.1 2023-08-10 [1] CRAN (R 4.2.2)
## ps 1.7.5 2023-04-18 [1] CRAN (R 4.2.0)
## purrr * 1.0.2 2023-08-10 [1] CRAN (R 4.2.2)
## R6 2.5.1 2021-08-19 [1] CRAN (R 4.2.0)
## RColorBrewer 1.1-3 2022-04-03 [1] CRAN (R 4.2.0)
## Rcpp 1.0.11 2023-07-06 [1] CRAN (R 4.2.0)
## RCurl 1.98-1.12 2023-03-27 [1] CRAN (R 4.2.0)
## readr * 2.1.4 2023-02-10 [1] CRAN (R 4.2.0)
## remotes 2.4.2.1 2023-07-18 [1] CRAN (R 4.2.2)
## reshape2 1.4.4 2020-04-09 [1] CRAN (R 4.2.0)
## rlang 1.1.1 2023-04-28 [1] CRAN (R 4.2.0)
## rmarkdown 2.25 2023-09-18 [1] CRAN (R 4.2.2)
## RSQLite 2.3.1 2023-04-03 [1] CRAN (R 4.2.0)
## rstudioapi 0.15.0 2023-07-07 [1] CRAN (R 4.2.0)
## S4Vectors 0.36.2 2023-02-26 [1] Bioconductor
## sass 0.4.7 2023-07-15 [1] CRAN (R 4.2.0)
## scales 1.2.1 2022-08-20 [1] CRAN (R 4.2.0)
## sessioninfo 1.2.2 2021-12-06 [1] CRAN (R 4.2.0)
## shiny 1.7.5.1 2023-10-14 [1] CRAN (R 4.2.2)
## stringi 1.7.12 2023-01-11 [1] CRAN (R 4.2.0)
## stringr * 1.5.0 2022-12-02 [1] CRAN (R 4.2.0)
## SummarizedExperiment 1.28.0 2022-11-01 [1] Bioconductor
## tibble * 3.2.1 2023-03-20 [1] CRAN (R 4.2.0)
## tidyr * 1.3.0 2023-01-24 [1] CRAN (R 4.2.0)
## tidyselect 1.2.0 2022-10-10 [1] CRAN (R 4.2.0)
## tidyverse * 2.0.0 2023-02-22 [1] CRAN (R 4.2.0)
## timechange 0.2.0 2023-01-11 [1] CRAN (R 4.2.0)
## tzdb 0.4.0 2023-05-12 [1] CRAN (R 4.2.2)
## urlchecker 1.0.1 2021-11-30 [1] CRAN (R 4.2.0)
## usethis 2.2.2 2023-07-06 [1] CRAN (R 4.2.0)
## utf8 1.2.3 2023-01-31 [1] CRAN (R 4.2.0)
## vctrs 0.6.4 2023-10-12 [1] CRAN (R 4.2.2)
## vroom 1.6.4 2023-10-02 [1] CRAN (R 4.2.2)
## withr 2.5.1 2023-09-26 [1] CRAN (R 4.2.0)
## xfun 0.40 2023-08-09 [1] CRAN (R 4.2.2)
## XML 3.99-0.14 2023-03-19 [1] CRAN (R 4.2.0)
## xtable 1.8-4 2019-04-21 [1] CRAN (R 4.2.0)
## XVector 0.38.0 2022-11-01 [1] Bioconductor
## yaml 2.3.7 2023-01-23 [1] CRAN (R 4.2.0)
## zlibbioc 1.44.0 2022-11-01 [1] Bioconductor
##
## [1] /Library/Frameworks/R.framework/Versions/4.2/Resources/library
##
## ──────────────────────────────────────────────────────────────────────────────